# Load packages
library(here)
library(tidyverse)
library(lubridate)
library(kableExtra)
library(countrycode) # to code region below
library(janitor)
library(stringi)

start_date = ymd("2008-07-01")
end_date = ymd("2023-03-15")


# APSR office closures 
# September 7-14, 2020 (APSA)
# December 23 2020-Jan 6 2021 (winter break)
# September 27 - October 3, 2021 (APSA)
# December 23 2021-Jan 6 2022 (winter break)
# September 11 - 19, 2022 (APSA)
# December 23 2022-Jan 6 2023 (winter break)
# August 28 -  September 3, 2023 (APSA)
# December 23 2023-Jan 6 2024 (winter break)
reason_close = c("APSA2020", 
                 "Winter2020", 
                 "APSA2021",
                 "Winter2021",
                 "APSA2022",
                 "Winter2022",
                 "APSA2023", 
                 "Winter2023")
start_close = c(ymd("2020-09-07", 
                    "2020-12-23", 
                    "2021-09-27", 
                    "2021-12-23", 
                    "2022-09-11", 
                    "2022-12-23",
                    "2023-08-28", 
                    "2023-12-23"))
end_close = c(ymd("2020-09-14", 
                  "2021-01-06",
                  "2021-10-03",
                  "2022-01-06",
                  "2022-09-19",
                  "2023-01-06",
                  "2023-09-03",
                  "2024-01-06"))
officeclosed <- tibble(reason_close, start_close, end_close)

# Set functions
# A function to filter out columns with all NA rows
all_na <- function(x) any(!is.na(x))


# The goal of this chunk is to create a main dataset where each revision for a ms
# is a row (eg. 4 R&R = 5 rows)

###################################################
# Import and join relevant data files
###################################################

###################################################
# Import author & submission file 
# This file has one row per REVISION per Manuscript Number 
###################################################

# Download from: Reports - Custom Report - New Report - Authors & Submissions View - 'Download the data file in Unicode format - UTF-8'
sub.all <- read_delim(here::here("data", "ROLEAUTH_DOC_PEOPLE_ADDR_UNICODE.tab"), 
                  delim = "\t", quote = "", locale = default_locale(), 
                  guess_max = 20000) %>%
  select_if(all_na) %>% # remove columns that are all NA
  clean_names() %>% #default is snakecase
  rename(revision = revision_number,  # rename key variables
         man_num = manuscript_number) %>%
  relocate(man_num, revision)
# write_csv(head(sub.all,6000),"sub.all.uncoded.csv")

sub.all <- sub.all %>%
 mutate_at(c("initial_date_submitted", "first_receipt_date", "editorial_status_date",
             "edit_submission_status_date", "date_final_disposition_set",
             "final_decision_date", "date_submission_began", "date_of_first_decision",
             "revision_due_date", "date_revision_assigned"),
           ~as_date(mdy_hms(.)), na.rm = TRUE) %>%
  select(-date_revision_submitted) # remove  field w/bad data

# head(sub.all$`All Authors`) # authors divided by ;

# number of ms without man number - am going to exclude b/c I don't think they "count" as submissions
sum(is.na(sub.all$man_num)) # 172

# remove records w/o man num
sub.all = sub.all %>%
  filter(!is.na(man_num))

sub.all = unique(sub.all) #preserves first row only if multiple identical rows

nrow(sub.all)


###################################################
# NOTE: There's an apparent error in the download above.
# Date Revision Submitted should change for each revision, but it does not
# The Authors & Submissions download has a similar problem. 
# Correct field values download with the Editors & Submission report
# selectively import those fields below to add to main file
###################################################

# Download from: Reports - Custom Report - New Report - Editors & Submissions View - 'Download the data file in Unicode format - UTF-8' 
#####     ######    #####     ######
##### select only the fields below!!! ######
#####     ######    #####     ######
### the download includes reviewer and other comments that need to be parsed carefully to import. 

#Re-ordered these for easy(er) download
 
sub.eds.dates = read_delim(here::here("data", "ROLEEDIT_DOC_PEOPLE_ADDR_AUTH_UNICODE.tab"), 
                      col_types = cols_only("Editor Start Date" = col_character(),
                                            "Editor Stop Date" = col_character(),
                                            "Editor People ID" = col_double(),
                                            "Decision Term" = col_character(),
                                            "Days with Editor" = col_double(),
                                            "Decision in Progress" = col_logical(),
                                            "Decision Finalized" = col_logical(),
                                            "Date Editor Accepted Assignment" = col_character(),
                                            "Last Name Assigning Editor" = col_character(),
                                            "Previously Assigned Editor(s)" = col_character(),
                                            "Last Name Editor Assigned To" = col_character(),
                                            "Handling Editor Last Name" = col_character(),
                                            "Last Name of the First Assigned Editor" = col_character(),
                                            "Last Name of the Editor who made First Decision" = col_character(),
                                            "Last Name of the Editor who made Final Decision" = col_character(),
                                            "Editor Last Name" = col_character(),
                                            "Editor Description" = col_character(),
                                            "Manuscript Number" = col_character(),
                                            "Revision Number" = col_double(),
                                            "First Decision Term" = col_character(),
                                            "Date Revision Submitted" = col_character()),
                      delim = "\t", quote = "", locale = default_locale(),
                      guess_max = 20000) 

# prepare for merge
sub.eds.dates = sub.eds.dates %>%
  clean_names() %>%
  rename(man_num = manuscript_number, 
         revision = revision_number) %>%
  relocate(man_num, revision) %>%
  filter(!is.na(man_num)) %>% #get rid of NAs
  mutate(date_revision_submitted = as_date(mdy_hms(date_revision_submitted)),
         editor_start_date = as_date(mdy_hms(editor_start_date))) 

sub.eds.dates = sub.eds.dates %>%
  group_by(.dots=c("man_num","revision")) %>% #.dots way to make it a joint group_by key
  arrange(desc(days_with_editor), .by_group = TRUE) %>% #found duplicates
  slice(n =1) %>% # keep only first row if multiple - there seem to be some dupes - unable to figure out why?
  ungroup()

# merge  
sub.all = sub.all %>%
  left_join(sub.eds.dates, by = c("man_num", "revision"))

#sub.all is one row per revision per ms. it has the correct date revision submitted. 

sub.all.bak = sub.all 


###################################################
# Import & merge Flags for author verification check 
###################################################

# Get from Custom Report -> New Report -> Submission Flag History View

flag <- read_delim(here::here("data","CUSTOM_FLAG_ASSIGNMENT_HISTORY_VIEW_UNICODE.tab"),
                  delim = "\t", quote = "", locale = default_locale(), 
                  guess_max = 20000)

# Pull in the author verification complete flag 

flag = flag %>%
  clean_names() %>% # Snaking names
  rename(revision = revision_number,
         man_num = manuscript_number) %>%
  filter(!is.na(man_num)) %>% # Get rid of those without man #
  mutate(date_flag_set = as_date(mdy_hms(date_flag_set))) %>% #clean date flag set
  unique() %>%
  relocate(man_num, revision)

# need to filter "Co-Author Verification Complete" flag from flag$`Submission Flag Name` & 
# select Date Flag Set
flag.auths = flag %>%
  filter(submission_flag_name =="Co-Author Verification Complete") %>%
  select(man_num, revision, date_flag_set) %>%
  rename(coauth_verif_date = date_flag_set) %>%
  unique() 

# clean up a bunch of flags set by Mannheim team just before transition
flag.auths = flag.auths %>%
  mutate(coauth_verif_date = na_if(coauth_verif_date,"2020-05-29"), 
         coauth_verif_date = na_if(coauth_verif_date,"2020-05-31")) %>%
  filter(!is.na(coauth_verif_date)) #remove empty

# then match on "Manuscript Number" & Revision  
sub.all = sub.all %>%
  left_join(flag.auths, by = c("man_num", "revision")) #join back to main one

################################################
# clean up corrected revision submission date
################################################
sub.all = sub.all %>%
  mutate(date_rev_sub_corrected = if_else((!is.na(coauth_verif_date) & date_revision_submitted <= coauth_verif_date), 
                                          coauth_verif_date, date_revision_submitted))


################################################
# additional corrections due to APSR office closure(s)
################################################
sub.all = sub.all %>% 
  mutate(date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[1] & date_rev_sub_corrected <= officeclosed$end_close[1], officeclosed$end_close[1]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[2] & date_rev_sub_corrected <= officeclosed$end_close[2], officeclosed$end_close[2]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[3] & date_rev_sub_corrected <= officeclosed$end_close[3], officeclosed$end_close[3]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[4] & date_rev_sub_corrected <= officeclosed$end_close[4], officeclosed$end_close[4]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[5] & date_rev_sub_corrected <= officeclosed$end_close[5], officeclosed$end_close[5]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[6] & date_rev_sub_corrected <= officeclosed$end_close[6], officeclosed$end_close[6]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[7] & date_rev_sub_corrected <= officeclosed$end_close[7], officeclosed$end_close[7]+1, date_rev_sub_corrected), 
         date_rev_sub_corrected = if_else(date_rev_sub_corrected >= officeclosed$start_close[8] & date_rev_sub_corrected <= officeclosed$end_close[8], officeclosed$end_close[7]+1, date_rev_sub_corrected))


# cleanup
rm(flag, flag.auths, sub.eds.dates)

sub.all.bak = sub.all


###################################################
# Generate academic year terms and teams factors
###################################################

# This chunk generates factors assigned to manuscripts according to the term they came into the system, 
# and the team that managed the initial submission.

sub.all = sub.all %>%
  filter(date_rev_sub_corrected > start_date) %>% # term started July 1
  mutate(ac_year = as_factor(case_when(
                             date_rev_sub_corrected >= mdy(07012008) & date_rev_sub_corrected <= mdy(06302009) ~ "2008-09",
                             date_rev_sub_corrected >= mdy(07012009) & date_rev_sub_corrected <= mdy(06302010) ~ "2009-10",
                             date_rev_sub_corrected >= mdy(07012010) & date_rev_sub_corrected <= mdy(06302011) ~ "2010-11",
                             date_rev_sub_corrected >= mdy(07012011) & date_rev_sub_corrected <= mdy(06302012) ~ "2011-12",
                             date_rev_sub_corrected >= mdy(07012012) & date_rev_sub_corrected <= mdy(06302013) ~ "2012-13",
                             date_rev_sub_corrected >= mdy(07012013) & date_rev_sub_corrected <= mdy(06302014) ~ "2013-14",
                             date_rev_sub_corrected >= mdy(07012014) & date_rev_sub_corrected <= mdy(06302015) ~ "2014-15",
                             date_rev_sub_corrected >= mdy(07012015) & date_rev_sub_corrected <= mdy(06302016) ~ "2015-16",
                             date_rev_sub_corrected >= mdy(07012016) & date_rev_sub_corrected <= mdy(06302017) ~ "2016-17",
                             date_rev_sub_corrected >= mdy(07012017) & date_rev_sub_corrected <= mdy(06302018) ~ "2017-18",
                             date_rev_sub_corrected >= mdy(07012018) & date_rev_sub_corrected <= mdy(06302019) ~ "2018-19",
                             date_rev_sub_corrected >= mdy(07012019) & date_rev_sub_corrected <= mdy(05312020) ~ "2019-20",
                             date_rev_sub_corrected >= mdy(06012020) & date_rev_sub_corrected <= mdy(05312021) ~ "2020-21", # June 1
                             date_rev_sub_corrected >= mdy(06012021) & date_rev_sub_corrected <= mdy(05312022) ~ "2021-22", 
                             date_rev_sub_corrected >= mdy(06012022) & date_rev_sub_corrected <= mdy(05312023) ~ "2022-23", 
                             date_rev_sub_corrected >= mdy(06012023) & date_rev_sub_corrected <= mdy(05312024) ~ "2023-24")), 
         team = as_factor(case_when(ac_year %in% c("2008-09", "2009-10", "2010-11", "2011-12") ~ "UCLA",
                                    ac_year %in% c("2012-13", "2013-14", "2014-15", "2015-16") ~ "UNT",
                                    ac_year %in% c("2016-17", "2017-18", "2018-19", "2019-20") ~ "Mannheim",
                                    ac_year %in% c("2020-21", "2021-22", "2022-23", "2023-24") ~ "Current")),
         ac_year = fct_relevel(ac_year, "2008-09", "2009-10", "2010-11", "2011-12", "2012-13" , "2013-14", "2014-15", "2015-16", "2016-17", "2017-18", "2018-19" ,  "2019-20" , "2020-21",  "2021-22"),
         new = if_else(revision == 0,1,0), #is this a new submission
         number_of_authors = 1 +(str_count(all_authors, pattern = ";"))) #counts the number of authors by unique string in all_authors

######################################
# ad hoc corrections for anomalies
######################################
sub.all = sub.all %>%
  mutate(initial_date_submitted = if_else(man_num == "APSR-D-18-01004", date_submission_began, initial_date_submitted),
         date_revision_submitted = if_else(man_num == "APSR-D-18-01004", date_submission_began, date_revision_submitted),
         date_rev_sub_corrected = if_else(man_num == "APSR-D-18-01004", date_submission_began, date_rev_sub_corrected))

sub.all.bak = sub.all

kable(table(sub.all$team, exclude = NULL), format = "simple") #number of submission by team
kable(table(sub.all$ac_year, exclude = NULL), format = "simple") #number of submissions by year
kable(table(sub.all$ac_year, sub.all$team, exclude = NULL), format = "simple") #number of submissions by year



###################################################
# Import & merge submission questionnaire info - 
###################################################
# answers to new team questions

# Only 1 row per manuscript in general 
# Does not include a Revision # for the questionaire
###################################################
# Download from: Reports - Custom Report - New Report - Submissions and Author Questionnaires - 'Download the data file in Unicode format - UTF-8'
sub.quest <- read_delim(here("data","SUBMISSIONS_AUTHOR_QUESTIONNAIRES_VIEW_UNICODE.tab"),
                  delim = "\t", quote = "", locale = default_locale(), 
                  guess_max = 150000)

# not many questions here
sub.quest.sm = sub.quest %>% 
  select_if(all_na) %>% # remove columns that are all NA
  clean_names() %>%
  rename(man_num = manuscript_number) %>%
  select(man_num, question_text, author_response,
         asked_on_original, asked_on_revision) %>%
  relocate(man_num) 

kable(table(sub.quest.sm$asked_on_original, sub.quest.sm$asked_on_revision, exclude = NULL), format = "simple")
# authors only got asked either 'asked on original' or 'asked on revision'

sub.quest.sm = sub.quest.sm %>%
  mutate(question_text = if_else(str_detect(question_text, "deposit"),"data_share", question_text),
         question_text = if_else(str_detect(question_text, "Did you generate"),"data_types", question_text),
         question_text = if_else(str_detect(question_text, "perceptions of ethical issues"),"ethics_other", question_text),
         question_text = if_else(str_detect(question_text, "research was reviewed"),"ethics_review", question_text),
         question_text = if_else(str_detect(question_text, "did you obtain"),"ethics_review", question_text),
         question_text = if_else(str_detect(question_text, "conflicts of interest"),"funded_coi", question_text),
         question_text = if_else(str_detect(question_text, "engage human"),"human_participants", question_text),
         question_text = if_else(str_detect(question_text, "involve human"),"human_participants", question_text),
         question_text = if_else(str_detect(question_text, "methodological approach employed"),"methodology", question_text),
         question_text = if_else(str_detect(question_text, "number of pages"),"num_pgs", question_text),
         question_text = if_else(str_detect(question_text, "overleaf"),"overleaf", question_text),
         question_text = if_else(str_detect(question_text, "conferences"),"presented", question_text),
         question_text = if_else(str_detect(question_text, "subfield"),"subfield", question_text)) %>%
  filter(question_text == "data_share" | question_text == "data_types"  
         | question_text == "ethics_other" | question_text == "ethics_review" 
         | question_text == "funded_coi" 
         | question_text == "human_participants" | question_text == "methodology"
         | question_text == "num_pgs" | question_text == "overleaf" 
         | question_text == "presented"  | question_text == "subfield") %>%
  filter(!is.na(author_response)) %>%
  unique() 

#TRV: has michelle grabbed all above classifications? Yes!

### FIXED 4/26/2022
sub.quest.sm = sub.quest.sm %>%
  group_by(.dots=c("man_num","question_text")) %>%
  arrange(desc(author_response),desc(asked_on_revision), .by_group = TRUE) %>%
  slice(n =1) %>% # keep only first row if multiple
  ungroup()

#all questions now cols
sub.quest.sm = sub.quest.sm %>%
  select(-asked_on_revision, -asked_on_original) %>%
  pivot_wider(names_from = question_text, values_from = author_response) 

#clean-up
sub.quest.sm = sub.quest.sm %>%
  mutate(overleaf = if_else(str_detect(overleaf, "Yes"), "yes", overleaf), 
         overleaf = if_else(str_detect(overleaf, "No"), "no", overleaf), 
         overleaf = na_if(overleaf, " - "), 
         methodology = str_replace_all(methodology, fixed(" - "), ""),
         methodology = na_if(methodology, "Not Applicable"), 
         methodology = na_if(methodology, ""),
         subfield = str_replace_all(subfield, fixed(" - "), ""),
         subfield = na_if(subfield, ""),
         data_types = str_replace_all(data_types, fixed("Yes, a quantitative dataset;Yes, qualitative data or evidence"),
                                      "mixed"),
         data_types = str_replace_all(data_types, fixed("Yes, a quantitative dataset"),
                                      "quantitative"),
         data_types = str_replace_all(data_types, fixed("Yes, qualitative data or evidence"),
                                      "qualitative"),
         data_types = str_replace_all(data_types, fixed("No;quantitative"),
                                      "quantitative"),
         data_types = str_replace_all(data_types, fixed("quantitative;quantitative"),
                                      "quantitative"),
         data_types = as_factor(data_types),
         ethics_other = str_replace_all(ethics_other, fixed(" - "), ""),
         human_participants = str_replace_all(human_participants, fixed(" - "), ""),
         data_share = str_replace_all(data_share, fixed(" - "), ""), 
         data_share = na_if(data_share, "No empirical data was analysed in this study"), 
         data_share = if_else(str_detect(data_share, "confirm that"), "Yes, without limitations", data_share), 
         data_share = as_factor(data_share), 
         ethics_review = as_factor(ethics_review),
         funded_coi = str_replace_all(funded_coi, fixed(" - "), ""))

###################################################
# Join submission questions to main sub file 
###################################################
sub.all.bak = sub.all
sub.all = sub.all %>%
  left_join(sub.quest.sm, by = "man_num") 
#all new answers are repeated for each revision, so filter for revision==0 to keep only 1 per ms


###################################################
# Import & merge author demographic info  
###################################################

# Download from: Reports - Custom Report - New Report - Submissions and All Authors Questionnaires - 'Download the data file in Unicode format - UTF-8'
sub.auth.dem <- read_delim(here("data","SUBMISSIONS_ALL_AUTHORS_QUESTIONNAIRES_VIEW_UNICODE.tab"),
                           col_types = cols_only("Manuscript Number" = col_character(),
                                                 "Question Text" = col_character(),
                                                 "Author Response" = col_character(),
                                                 "Asked at Initial Submission" = col_double(),
                                                 "Asked at Revision" = col_double(),
                                                 "Author's People ID" = col_double(),             
                                                 "Revision Independent Author ID" = col_character(),
                                                 "Author Type" = col_character(),
                                                 "Author's Title" = col_character(),
                                                 "Author's First Name" = col_character(),
                                                 "Author's Middle Name" = col_character(),
                                                 "Author's Last Name" = col_character(), 
                                                 "Author's Academic Degree" = col_character(), 
                                                 "Author's E-mail address" = col_character(),
                                                 "Author's Country" = col_character(), 
                                                 "Author's Affiliation" = col_character(),
                                                 "Author's Institution"  = col_character(),        
                                                 "Author's Institution ID" = col_character(),
                                                 "Author's Department" = col_character(),
                                                 "Author's Address Line 1"  = col_character(),
                                                 "Author's Address Line 2" = col_character(),
                                                 "Author's Address Line 3" = col_character(),
                                                 "Author's Address Line 4" = col_character(),
                                                 "Author's City" = col_character(),
                                                 "Author's State/Province" = col_character(),
                                                 "Author's Zip/Postal Code" = col_character(), 
                                                 "Author's Order" = col_double()),
                           delim = "\t", quote = "", locale = default_locale(), 
                           guess_max = 10000)

#row for every ms and every author and every question they have answered; questions only asked after 2016 officially 2018. 
sub.auth.dem = sub.auth.dem %>%
  clean_names() %>%
  select_if(all_na) %>% # remove columns that are all NA
  rename(man_num = manuscript_number) 

###################################################
# check to see if the questions were asked 2x 
table(sub.auth.dem$asked_at_initial_submission, sub.auth.dem$asked_at_revision, exclude = NULL)
###################################################

# colnames(sub.auth)
# need to remove records with NA author responses when there are more than 1 record
# some have 2 flag dates, keep the earlier one only 
sub.auth.dem = sub.auth.dem %>%
  arrange(revision_independent_author_id,question_text) %>%
  group_by(.dots=c("revision_independent_author_id","question_text")) %>%
  slice(n =1 ) %>% # keep only first row if multiple
  ungroup() 

# recode Question Text & reshape to wide
sub.auth.dem.w = sub.auth.dem %>%
  mutate(question_text = recode(question_text, "<b>In 2018, APSA Council approved the collection of demographic information about authors submitting to APSR. This is information is available only to APSA office staff, who use it to provide summary statistics for editorial reports to the membership. The information is not accessed or used in editorial decision-making.</b><br><br>  What is your primary position within the discipline?" = "primary_position", 
         "How would you characterize your home institution?" = "institution_type", 
         "What is your gender identity?" = "gender_identity", 
         "What is your primary position within the discipline?" = "primary_position", 
         "Which of the following best describes your racial or ethnic identification? Check all that apply." = "ethnicity", 
         "Which of the following best describes your racial or ethnic identification? Select all that apply." = "ethnicity")) %>%
  pivot_wider(names_from = question_text, values_from = author_response)
#question to columns and rows still author sub


kable(table(sub.auth.dem.w$authors_order), format = "simple") #take max gives you author numbers

# colnames(sub.auth.dem.w)
# head(sub.auth.dem.w)
# write.csv(sub.auth.dem.w, "sub.auth.dem.w.csv")

sub.auth.dem.sm = sub.auth.dem.w %>%
  select (man_num, authors_people_id, authors_order, authors_country, 
          authors_academic_degree, institution_type, gender_identity,
          primary_position, ethnicity) %>% #TRV makes sure we keep required selected. Yes!
  filter(!is.na(man_num)) %>%  # filter out manuscripts never assigned # under Mannheim team
  mutate(gender_identity = fct_recode(gender_identity, "Man" = "Male", "Woman" = "Female"), 
         institution_type = replace_na(institution_type, "Prefer not to answer"), 
         gender_identity = replace_na(gender_identity, "Prefer not to answer"),
         primary_position = replace_na(primary_position, "Prefer not to answer"), 
         ethnicity = replace_na(ethnicity, "Prefer not to answer")) # need to clean up a handful of early 
# records where questions were not mandatory
# this way - all rows have no NAs for survey question responses 
# that way when we remerge back to the main file - we can distinguish between NA b/c 
# did not answer survey and no answers. - md

sub.auth.dem.sm = sub.auth.dem.sm %>%
  mutate(authors_academic_degree = tolower(str_replace_all(authors_academic_degree, "[[:punct:]]", "")), #clean author degree
         authors_academic_degree = str_replace_all(authors_academic_degree, fixed(" "), ""),
         authors_academic_degree = if_else(str_detect(authors_academic_degree,"ba"),"pre-doc",authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree,"bs"),"pre-doc",authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "ma"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "cand"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "mphil"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "master"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "mba"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "ms"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "mp"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "student"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "jd"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "ll"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "abd"),"pre-doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "phd"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "doctor"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "dphil"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "dr"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "assistant"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "associate"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "postdoc"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "lectur"),"doc", authors_academic_degree),
         authors_academic_degree = if_else(str_detect(authors_academic_degree, "prof"), "doc", authors_academic_degree), 
         authors_academic_degree = as_factor(authors_academic_degree), 
         authors_academic_degree = fct_lump_n(authors_academic_degree, n = 2), #all others to Other
         authors_academic_degree = fct_recode(authors_academic_degree, "pre-doc" = "Other"), 
         gender_identity = na_if(gender_identity, "Prefer not to answer"), 
         gender_identity = as_factor(gender_identity), 
         primary_position = na_if(primary_position, "Prefer not to answer"), 
         institution_type = na_if(institution_type, "Prefer not to answer"), 
         institution_type = if_else(str_detect(institution_type, "College"),"College",institution_type),
         institution_type = fct_lump_n(institution_type, n=3), #combine a new Other
         primary_position = if_else(str_detect(primary_position, "Adjunct"),"untenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "Assistant"),"untenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "Non-Tenure"),"untenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "Fellow"),"untenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "Administration"),"tenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "with tenure"),"tenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "Full"),"tenured", primary_position),
         primary_position = if_else(str_detect(primary_position, "Independent"),"other", primary_position),
         primary_position = if_else(str_detect(primary_position, "Government"),"other", primary_position),
         primary_position = if_else(str_detect(primary_position, "Other"),"other", primary_position),
         primary_position = if_else(str_detect(primary_position, "Research"),"other", primary_position),
         primary_position = if_else(str_detect(primary_position, "Retired"),"other", primary_position),
         primary_position = if_else(str_detect(primary_position, "Student"),"student", primary_position),
         primary_position = as_factor(primary_position), 
         eth_other = if_else(str_detect(ethnicity, "Other"),TRUE, FALSE),
         eth_blk = if_else(str_detect(ethnicity, "Black"),TRUE, FALSE),
         eth_lat = if_else(str_detect(ethnicity, "Latin"),TRUE, FALSE),
         eth_ind = if_else(str_detect(ethnicity, "Indigenous"), TRUE, FALSE),
         eth_mideast = if_else(str_detect(ethnicity, "Arab"),TRUE, FALSE),
         eth_asian = if_else(str_detect(ethnicity, "Asian"),TRUE, FALSE),
         eth_white = if_else(str_detect(ethnicity, "White"),TRUE, FALSE),
         eth_dna = if_else(str_detect(ethnicity, "Prefer not to answer"), TRUE, FALSE), # treat "prefer not to answer like other answers
         eth_dna = if_else(eth_other == TRUE & # recode those that chose "all"
                                   eth_blk == TRUE &
                                   eth_lat == TRUE &
                                   eth_ind == TRUE &
                                   eth_mideast == TRUE &
                                   eth_asian == TRUE &
                                   eth_white == TRUE, TRUE, eth_dna),
         eth_other = if_else(eth_dna == TRUE, FALSE, eth_other), # may be a better way but 
         eth_blk = if_else(eth_dna == TRUE,FALSE,eth_blk), # this code "fixes" responses for those who
         eth_lat = if_else(eth_dna == TRUE,FALSE,eth_lat), # said prefer not but also picked another option
         eth_ind = if_else(eth_dna == TRUE,FALSE,eth_ind),
         eth_mideast = if_else(eth_dna == TRUE,FALSE,eth_mideast),
         eth_asian = if_else(eth_dna == TRUE,FALSE,eth_asian),
         eth_white = if_else(eth_dna == TRUE,FALSE,eth_white)) %>% 
  # select(-authors_country,-region) %>%
  filter(!is.na(man_num))


###################################################
# notes on ethicity - we recode it into a set of - non-mutually exclusive - dummies. 
# Then, we can report percentages & acknowledge that they don't sum to 
# 100% due to check all that apply. 



####################################
# make new columns w/counts of demographics per ms
####################################

sub.auth.dem.by.man = sub.auth.dem.sm %>%
  group_by(man_num) %>%
  mutate(a_n_predoc = sum(authors_academic_degree=="pre-doc", na.rm=TRUE), 
         a_n_doc = sum(authors_academic_degree=="doc", na.rm=TRUE),
         a_n_college = sum(institution_type=="College", na.rm=TRUE),
         a_n_uni_phd = sum(institution_type=="University with Ph.D. in Political Science", na.rm=TRUE),
         a_n_uni_nophd = sum(institution_type=="University without Ph.D. in Political Science", na.rm=TRUE),
         a_n_other_inst = sum(institution_type=="Other", na.rm=TRUE), 
         a_n_men_id = sum(gender_identity=="Man", na.rm=TRUE),
         a_n_women_id = sum(gender_identity=="Woman", na.rm=TRUE),
         a_n_nonbin_id =sum(gender_identity=="Non-binary", na.rm=TRUE),
         a_n_tenured = sum(primary_position=="tenured", na.rm=TRUE),
         a_n_untenured = sum(primary_position=="untenured", na.rm=TRUE),
         a_n_student = sum(primary_position=="student", na.rm=TRUE),
         a_n_other_pos = sum(primary_position=="other", na.rm=TRUE),
         a_n_white = sum(eth_white==TRUE & eth_dna == FALSE, na.rm=TRUE), # added code to exclude DNA just in case
         a_n_latinx = sum(eth_lat==TRUE & eth_dna == FALSE, na.rm=TRUE),
         a_n_asian = sum(eth_asian==TRUE & eth_dna == FALSE, na.rm=TRUE),
         a_n_other = sum(eth_other==TRUE & eth_dna == FALSE, na.rm=TRUE),
         a_n_indig = sum(eth_ind==TRUE & eth_dna == FALSE, na.rm=TRUE),
         a_n_black = sum(eth_blk==TRUE & eth_dna == FALSE, na.rm=TRUE),
         a_n_mideast = sum(eth_mideast==TRUE & eth_dna == FALSE, na.rm=TRUE), 
         a_n_eth_dna = sum(eth_dna == TRUE)) %>% # added column to count DNAs - need it for totals when returning to sub.all
  ungroup() %>%
  select(man_num, a_n_predoc:a_n_eth_dna) %>%
  unique()


###################################################
# add demo data back to main submission df
###################################################
sub.all.bak = sub.all 
sub.all = sub.all  %>%
  left_join(sub.auth.dem.by.man, by = "man_num")

# clean up intermediate dfs
rm(sub.auth.dem.w, sub.quest, sub.quest.sm, sub.auth.dem.by.man) # note keep sub.auth.dem.sm for next section

sub.all.bak = sub.all


###################################################
#  gender database
###################################################
# In the next section, we are able to match individual people records to 
# the genderizer.io generated database (gender_coded.csv) because
# each reviewer record includes the global people ID. 
# HOWEVER, *before* January 2018, the people ID is only available for 
# the corresponding author. 
# After January 2018, it is available for all authors.
###################################################

###################################################
# import the gender database 
# created by gender_database.Rmd
###################################################
# read the databse
genderize = read_csv(here("data","gender","people_gender.csv"), guess_max = 45000)

# select only key variables 
gender = genderize %>%
  rename(author_people_id = people_unique_id) %>%
  select(author_people_id, gender_api, gender_confid, gender_title, gender_selfid, self_title, gender_coded) %>%
  unique()


###################################################
# code corresponding author by people ID
###################################################
###################################################
# this is corresponding author whole time series
###################################################
sub.all = sub.all %>%
  left_join(gender, by = "author_people_id") %>%
  rename(corr_gender_api = gender_api,
         corr_gender_confid = gender_confid, 
         corr_gender_title = gender_title, 
         corr_gender_selfid = gender_selfid,
         corr_self_title = self_title, 
         corr_author_coded = gender_coded) 


###################################################
# function used in the gender database .Rmd to clean names
###################################################
clean.names <- function(x, delete.missing=FALSE){
  str_sub(x,1,1) <- toupper(str_sub(x,1,1)) # make sure first character is capitalized
  names <- str_extract(x, "[[:upper:]]{1}([[:lower:]]|[[:upper:]])+") # extract the first part of the names
  names <- stri_trans_general(names, "latin-ascii") # clean special characters
  if(delete.missing==TRUE){names <- names[is.na(names)==FALSE]} # delete missings
  return(names)
}


###################################################
# code all authors by first middle last name match
###################################################
# prepare gender file for matching
###################################################
gender_names = genderize %>%
  mutate(lastname_clean = clean.names(last_name)) %>%
  group_by(.dots=c("firstname_clean", "middlename_clean", "lastname_clean")) %>%
  arrange(firstname_clean, middlename_clean, lastname_clean, desc(people_record_last_update_date)) %>%
  slice(n = 1) %>%  # keep only first row if multiple <- given how gender_coded is setup, this keeps most data
  unique() %>%
  ungroup()

gender_names = gender_names %>%
  select(firstname_clean, middlename_clean, lastname_clean, gender_api, gender_confid, 
         gender_title, gender_selfid, self_title, gender_coded) %>%
  unique() 


###################################################
# prepare sub.all for matching 
###################################################
head(sub.all$all_authors_and_contributor_roles)
# format just authors by role manuscript, revision and names 
sub.all.authors = sub.all %>%
  select(man_num, revision, all_authors_and_contributor_roles) %>%
  mutate(all_authors_and_contributor_roles = str_split(all_authors_and_contributor_roles,";")) %>%
  unnest(all_authors_and_contributor_roles) %>% 
  group_by(.dots=c("man_num","revision")) %>%
  mutate(author_num = row_number()) %>% # add author order
  ungroup %>%
  separate(all_authors_and_contributor_roles, into = c("name",NA) , sep = ",") %>% # remove everything after , throws warning, but ok
  mutate(name = str_remove(name, "\\."), 
         name = str_replace(name, "\\s[A-Z]\\s", " "), 
         name = str_trim(name, side = "both"), 
         first_name = word(name,1), # extract first name
         last_name = word(name,-1), # extract last name 
         middle_name = if_else(str_count(name, '\\w+') > 2,word(name,2),NA_character_), # extract the 2nd word
         middle_name = if_else(middle_name == last_name,NA_character_,middle_name), # replace w/NA if really family name
         firstname_clean = clean.names(first_name), 
         middlename_clean = clean.names(middle_name), 
         lastname_clean = clean.names(last_name)) %>% 
  select(-first_name, -middle_name, -last_name) %>%
  unique() 

sub.all.authors.temp = sub.all.authors %>%
  left_join(gender_names, by = c("firstname_clean", "middlename_clean", "lastname_clean")) 

sub.all.authors.na = sub.all.authors.temp %>%
  filter(is.na(sub.all.authors.temp$gender_coded))

# now match back to the sub.all - but count up author genders and/or recode to mixed, male solo, etc. 


###################################################
### make a series of files to match back to sub.all
###################################################
# post 2018 author's demo questionnaire  
###################################################
sub.auth.dem.sm = sub.auth.dem.sm %>%
  rename(author_people_id = authors_people_id)
gender.sub.auth.dem = sub.auth.dem.sm %>%
  left_join(gender, by = "author_people_id") %>%
  # select(man_num, authors_order, authors_people_id, gender_api, gender_confid, gender_title, gender_selfid, self_title, gender_coded) %>%
  unique()
# result is file with 1 row per author per ms


###################################################
# match first names of authors
###################################################


###################################################
# match first names if not in gender db
###################################################

###################################################
# read the databse
genderize = read_csv(here("data","gender","people_gender.csv"), guess_max = 45000)
# select only key variables 
genderize = genderize %>%
  select(firstname_clean, country_iso2c, pr1, gender_coded) %>%
  unique() %>%
  group_by(firstname_clean) %>%
  arrange(firstname_clean, desc(pr1)) %>%
  slice(n =1) %>%  # keep only first row if multiple 
  ungroup() %>%
  select(firstname_clean, gender_coded) 

###################################################
# match on first name 
gender.sub.all.auth = sub.all.authors %>%
  left_join(genderize, by = "firstname_clean")

###################################################
# NOTE: the column above has the "title" for each other
# this could be used to code phd/not - but will need 
# some care to clean
###################################################

###################################################
# match all these back to main sub.all 
###################################################
# 1. Those since 2018 w/author demographic questionaire
###################################################
# these are the most accurate b/c they include country w/gender prediction
gender.post2018 = gender.sub.auth.dem %>%
  ungroup() %>%
  select(-author_people_id) %>%
  arrange(man_num, authors_order) %>%
  group_by(man_num) %>%
  summarize(n_men_coded1 = sum(gender_coded == "man", na.rm=TRUE),
         n_women_coded1 = sum(gender_coded == "woman", na.rm=TRUE), 
         first_author_coded1 = if_else(authors_order == 1,gender_coded, "")) %>%
  slice(n=1)
###################################################
# 2. pre 2018: other authors by first name
###################################################
# only match if is NA after above matches
# these are more complete but less accurate b/c 
# they match only on name w/o country
gender.pre2018 = gender.sub.all.auth %>%
  select(man_num, revision, author_num, gender_coded) %>%
  arrange(man_num, revision, author_num) %>%
  group_by(.dots=c("man_num","revision")) %>%
  summarize(n_men_coded2 = sum(gender_coded == "man", na.rm=TRUE),
            n_women_coded2 = sum(gender_coded == "woman", na.rm=TRUE), 
            first_author_coded2 = case_when(author_num == 1 ~ gender_coded)) %>%
  slice(n=1)
###################################################
# merge into main 
sub.all = sub.all %>%
  left_join(gender.post2018, by = "man_num")
sub.all = sub.all %>%
  left_join(gender.pre2018, by = c("man_num", "revision"))
## check 
# temp = sub.all %>% 
#   mutate(pre_minus_post_m = n_male_coded2 - n_male_coded1, 
#          pre_minus_post_f = n_female_coded2- n_female_coded1,
#          pre_post_first = if_else(first_author_coded1 == first_author_coded2, TRUE, FALSE)) 
# temp = temp %>%
#   filter(pre_post_first == FALSE)

# keep post-2018 first, and pre-2018 only when missing
sub.all = sub.all %>%
  mutate(n_men_coded = if_else(is.na(n_men_coded1),n_men_coded2,n_men_coded1), 
         n_women_coded = if_else(is.na(n_women_coded1),n_women_coded2,n_women_coded1),
         first_author_coded = if_else(is.na(first_author_coded1),first_author_coded2,first_author_coded1)) %>%  
  select(-(n_men_coded1:first_author_coded2))


###################################################
# 4. use first author name to match to first name genderize 
###################################################
# read the databse
genderize = read_csv(here("data","gender","people_gender.csv"), guess_max = 45000)
# select only key variables 
genderize = genderize %>%
  select(firstname_clean, country_iso2c, pr1, gender_coded) %>%
  unique() %>%
  group_by(firstname_clean) %>%
  arrange(firstname_clean, desc(pr1)) %>%
  slice(n =1) %>%  # keep only first row if multiple 
  ungroup() %>%
  select(firstname_clean, gender_coded) 

###################################################
# match on first name 
sub.all = sub.all %>%
  mutate(firstname_clean = clean.names(first_author_first_name)) %>%
  left_join(genderize, by = "firstname_clean") %>%
  rename(first_author_coded_2 = gender_coded) %>%
  select(-firstname_clean)

# definitions to be double-checked and added to codebook.xls
# n_male_coded - based on member id after 2018, first name before 2018	
# n_female_coded - based on member id after 2018, first name before 2018
# first_author_coded - based on author order in questionaire after 2018, first name before 2018 
#       question: does the author order in that df represent their order as authors or 
#       is it corresponding and then 2,3,4 etc. ? 
# corresponding_author_coded - based on member id of designated corresponding author for entire period
# first_author_coded_2 - based on first name of designated first author for entire period
#
# by member id means that the country was used in the prediction
# by first name means that country was not used, and instead the most probable gender for that name is used


########################################
# clean up decision terms for use later
# resolve ambiguities based on EM configs
########################################
# mis cleanup that should be merged where relevant in clean code 
sub.all = sub.all %>%
  mutate(decision_term = if_else(decision_term == "[Author Notified by Guest Editor]", # COI guest eds
                                 final_decision_term,decision_term), # use final term instead
         decision_term_corrected = case_when( 
           is.na(decision_term) ~ NA_character_, 
           str_detect(decision_term, "Terminated") ~ "terminated", # usually tech issue unless withdrawn
           str_detect(decision_term, "CA") ~ "accept", 
           str_detect(decision_term, "Accept") ~ "accept", 
           str_detect(decision_term, "Revise") ~ "revise", 
           str_detect(decision_term, "Reject") ~ "reject"), 
         # fix those that were withdrawn @ author request - a mix of terminated/revise 
         decision_term_corrected = if_else(!is.na(final_disposition_term) &  # clean up those w/drawn
                                             final_disposition_term == "Withdrawn", # coded terminated 
                                           "withdrawn", decision_term_corrected),
         decision_term_corrected = as_factor(decision_term_corrected),
         decision_term_corrected = fct_relevel(as_factor(decision_term_corrected), 
                                               "withdrawn", "terminated",
                                               "reject", "revise", "accept"), 
         final_decision_term_corrected = case_when(
           is.na(final_decision_term) ~ NA_character_, 
           str_detect(final_decision_term,"Accept") ~ "accept",
           str_detect(final_decision_term,"Reject") ~ "reject"), 
         date_decision_term = as_date(mdy_hms(editor_stop_date)))


###################################################
# Import & merge reviewer reports
###################################################
# Download from: Reports - Custom Report - New Report - Reviewers & Submissions View - 'Download the data file in Unicode format - UTF-8'
#for each ms and each revision a row for each reviewer, as well as reviewers that were invited and did not complete them, perhaps not even invited

sub.rev <- read_delim(here("data","ROLEREVU_DOC_PEOPLE_ADDR_AUTH_ED_UNICODE.tab"), 
                           delim = "\t", quote = "",  locale = default_locale(), 
                           guess_max = 60000)
sub.rev.bak = sub.rev

# reduce columns to those specific to reviewer (delete dupes at ms level)
sub.rev = sub.rev %>%
  select(!(`Article Title`:Abstract)) %>%
  select(!(`Final Decision Date`:`Grant Recipient(s)`)) %>%
  select(-`Unique Document ID`) %>%
  select(-`Initial Date Submitted`)

# recode and rename variables
sub.rev = sub.rev %>% 
  select_if(all_na) %>% # remove columns that are all NA
  clean_names() %>%
  rename(man_num = manuscript_number,
         revision = revision_number) %>%
  relocate(man_num, revision, reviewer_people_id)
  
sub.rev = sub.rev %>% 
  mutate(date_reviewer_invited = as_date(mdy_hms(date_reviewer_invited)),
         agree_date = as_date(mdy_hms(agree_date)),
         alternate_reviewer_selection_date = as_date(mdy_hms(alternate_reviewer_selection_date)),
         date_review_due = as_date(mdy_hms(date_review_due)), 
         date_last_reminder_sent = as_date(mdy_hms(date_last_reminder_sent)),
         date_completed_after_termination = as_date(mdy_hms(date_completed_after_termination)),
         reviewer_stop_date = as_date(mdy_hms(reviewer_stop_date)),
         date_last_completed_a_review = as_date(mdy_hms(date_last_completed_a_review)),
         invited = if_else(is.na(date_reviewer_invited),FALSE, TRUE),
         accepted = if_else(is.na(agree_date),FALSE, TRUE),
         board_member = if_else(person_is_a_board_member == TRUE,"reviewer_board_member","not_board_member"),
         decline_date = if_else(reviewer_declined_invitation == TRUE, reviewer_stop_date, ymd(NA)),
         # decline_date = as_date(mdy_hms(decline_date)),
         date_completed = if_else(completed_after_termination == TRUE, date_completed_after_termination, if_else(review_complete == TRUE, reviewer_stop_date,ymd(NA))),
         # date_completed = as_date(mdy_hms(date_completed)),
         region = as_factor(countryname(country, destination = "region23")),
         region5 = fct_lump_n(region, n = 5), # combine all other regions into Other
         region3 = fct_recode(region5, "N. America" = "Northern America", Other = "Australia and New Zealand", Europe = "Northern Europe", Europe = "Western Europe", Europe = "Southern Europe"),
         reviewer_recommendation = fct_lump(reviewer_recommendation, n=5), # combine sm into Other
         reviewer_recommendation = fct_collapse(reviewer_recommendation, 
                                                terminated = c("Other", "[Terminated by Editor]"))) %>% # combine sm into Other
  arrange(man_num, revision, reviewer_recommendation)


###################################################
# import and merge the genderize data
###################################################
# read the database
genderize = read_csv(here("data","gender","people_gender.csv"), guess_max = 50000)
# why are there dupes?
# because about 40 people are in the people database 2x - due to how they 
# have updated their files or how the gender database.rmd is set up

# select only key variables 
genderize = genderize %>%
  select(people_unique_id, gender_coded) %>%
  rename(reviewer_people_id = people_unique_id) %>%
  unique()
# join to main author demographic data
sub.rev = sub.rev %>%
  left_join(genderize, by = "reviewer_people_id")


###################################################
# collapse to manuscript revision for merging
###################################################
# md changed the approach here to save the key dates & 
# then do the duration/calculations *after* the merge with the main df 
# so we can use the corrected start date for the submission
###################################################
sub.rev.sm = sub.rev %>%
  group_by(.dots=c("man_num","revision")) %>%
  summarize(rev_n_invited = sum(!is.na(date_reviewer_invited), na.rm=TRUE), 
            rev_n_declined = sum(reviewer_declined_invitation==TRUE, na.rm=TRUE), 
            rev_n_completed = sum(review_complete==TRUE, na.rm=TRUE), 
            rev_n_accepted = sum(accepted==TRUE, na.rm=TRUE),
            rev_n_eb_members = sum(board_member=="is_board_member", na.rm=TRUE), 
            rev_n_men_invited = sum(invited==TRUE & gender_coded=="male", na.rm=TRUE), 
            rev_n_women_invited = sum(invited==TRUE & gender_coded=="female", na.rm=TRUE), 
            rev_n_men_completed = sum(review_complete==TRUE & gender_coded=="male", na.rm=TRUE), 
            rev_n_women_complete = sum(review_complete==TRUE & gender_coded=="female", na.rm=TRUE), 
            rev_n_northamerica_invited = sum(invited==TRUE & region3=="N. America", na.rm=TRUE), 
            rev_n_northamerica_completed = sum(review_complete==TRUE & region3=="N. America", na.rm=TRUE), 
            rev_n_eur_invited= sum(invited==TRUE & region3=="Europe", na.rm=TRUE), 
            rev_n_eur_completed = sum(review_complete==TRUE & region3=="Europe", na.rm=TRUE), 
            rev_n_other_region_invited = sum(invited==TRUE & region3=="Other", na.rm=TRUE), 
            rev_n_other_region_completed = sum(review_complete==TRUE & region3=="Other", na.rm=TRUE), 
            rev_n_ebmember_invited = sum(invited==TRUE & board_member=="is_board_member", na.rm=TRUE), 
            rev_n_ebmember_completed = sum(review_complete==TRUE & board_member=="is_board_member", na.rm=TRUE), 
            rev_n_terminated = sum(reviewer_recommendation == "terminated", na.rm=TRUE), 
            rev_n_rec_reject = sum(reviewer_recommendation == "Reject", na.rm=TRUE), 
            rev_n_rec_min_rev = sum(reviewer_recommendation == "Minor Revision", na.rm=TRUE), 
            rev_n_rec_maj_rev = sum(reviewer_recommendation == "Major Revision", na.rm=TRUE), 
            rev_n_rec_accept = sum(reviewer_recommendation == "Accept", na.rm=TRUE), 
            date_invited_min = min(date_reviewer_invited, na.rm=TRUE),
            date_accept_min = if_else(rev_n_accepted > 0, min(agree_date), NA_Date_),
            date_completed_max = if_else(rev_n_completed > 0, max(date_completed), NA_Date_),
            date_completed_min = if_else(rev_n_completed > 0, min(date_completed), NA_Date_)) %>%
  ungroup()

###################################################
# Join summary of reviews to the main sub df 
###################################################
# left join
sub.all = sub.all %>%
  left_join(sub.rev.sm, by = c("man_num", "revision"))

#### note: the summary stats above will be appended to records in sub.all
# even if there is not yet a decision made
# Need to either think about how to fix that OR
# Remember to filter out those wihtout decisions yet when calculating stats/using these fields

sub.all.bak =sub.all

###################################################
# calculate date when min completed 3 reviews 
###################################################
completed_reviews = sub.rev %>%
  filter(review_complete == TRUE) %>%
  group_by(.dots=c("man_num","revision")) %>%
  arrange(man_num, revision, date_completed) %>%
  mutate(date_1st_review = date_completed[1], 
         date_2nd_review = date_completed[2], 
         date_3rd_review = date_completed[3], 
         date_4th_review = date_completed[4]) %>% # pick value in the 3rd row
  ungroup() %>%
  select(man_num, revision, date_1st_review, date_2nd_review, date_3rd_review, date_4th_review) %>%
  unique()

#### merge to main file 
sub.all = sub.all %>%
  left_join(completed_reviews, by = c("man_num", "revision"))

###################################################
# clean up 
rm(sub.rev.bak, sub.rev, sub.rev.sm, completed_reviews)
sub.all.bak = sub.all


######################################
# Calculate duration times - added by md
######################################
sub.all = sub.all %>%
  mutate(days_sub_to_1stinvite = date_invited_min -date_rev_sub_corrected, # Sub to 1st reviewer
         days_sub_to_2complete = date_2nd_review - date_rev_sub_corrected, # Sub to 2 reviews complete
         days_sub_to_3complete = date_3rd_review - date_rev_sub_corrected, # Sub to 3 reviews complete
         days_sub_to_decision = date_decision_term - date_rev_sub_corrected, # Sub to decision
         days_1stinvite_to_2complete = date_2nd_review - date_invited_min, #1st invite to 2 complete
         days_1stinvite_to_3complete = date_3rd_review - date_invited_min, #1st invite to 3 complete
         days_1stinvite_to_decision = date_decision_term - date_invited_min, # 1st invite to decision
         days_2complete_to_decision = date_decision_term - date_2nd_review, # 2 reviews to decision
         days_3complete_to_decision = date_decision_term - date_3rd_review, # 3 reviews to decision 
         # desk reject - from initial submission to reject without review
         days_1stsub_to_desk_reject = ifelse(decision_term_corrected == "reject" & is.na(rev_n_invited),date_decision_term-date_rev_sub_corrected,NA_POSIXct_),
         # for each revision - from revision submision to reject after review
         days_sub_to_reject_aft_review = ifelse(decision_term_corrected == "reject" & rev_n_invited >0, date_decision_term-date_rev_sub_corrected,NA_POSIXct_),
         # for first submission - from submision to reject after review
         days_1stsub_to_reject_aft_review = ifelse(decision_term_corrected == "reject" & rev_n_invited >0 & revision == 0, date_decision_term-date_rev_sub_corrected,NA_POSIXct_),
         # for each revision - from revision submision to revise 
         days_sub_to_revise = ifelse(decision_term_corrected == "revise", date_decision_term-date_rev_sub_corrected,NA_POSIXct_),
         # for first submission - from submision to revise 
         days_1stsub_to_revise = ifelse(decision_term_corrected == "revise" & revision == 0, date_decision_term - date_rev_sub_corrected, NA_POSIXct_),
         # for each revision - from revision submission to accept 
         days_sub_to_accept = ifelse(decision_term_corrected == "accept", date_decision_term - date_rev_sub_corrected,NA_POSIXct_), 
         # for overall process
         days_1stsub_to_final_decision = ifelse(revision == 0, final_decision_date - date_rev_sub_corrected, NA_POSIXct_), 
         days_1stsub_to_final_accept = ifelse(final_decision_term_corrected == "accept" & revision == 0, final_decision_date - date_rev_sub_corrected,NA_POSIXct_), 
         days_1stsub_to_final_reject = ifelse(final_decision_term_corrected == "reject" & revision == 0, final_decision_date - date_rev_sub_corrected,NA_POSIXct_)) 

# check the results - make sure no negative #s
# temp <- sub.all %>%
#   filter(revision == 0) %>%
#   group_by(team) %>%
#   summarize(across(starts_with("days_"), min, na.rm = TRUE)) %>%
#   as_tibble()

################################################
# clean up handful that have 3rd review *after* decision date & also other decisions after submission
################################################
sub.all = sub.all %>%
  mutate(days_3complete_to_decision = if_else(days_3complete_to_decision <0, as.difftime(0, units = "days"), days_3complete_to_decision),
         days_sub_to_1stinvite = if_else(days_sub_to_1stinvite <0, as.difftime(0, units = "days"), days_sub_to_1stinvite),
         days_sub_to_decision = if_else(days_sub_to_decision <0, as.difftime(0, units = "days"), days_sub_to_decision),
         days_1stsub_to_desk_reject = if_else(days_1stsub_to_desk_reject <0, as.difftime(0, units = "days"), days_1stsub_to_desk_reject),
         days_1stsub_to_final_decision = if_else(days_1stsub_to_final_decision <0, as.difftime(0, units = "days"), days_1stsub_to_final_decision),
         days_1stsub_to_final_reject = if_else(days_1stsub_to_final_reject <0, as.difftime(0, units = "days"), days_1stsub_to_final_reject))



##########################################
# code author teams
##########################################
sub.all = sub.all %>%
  mutate(author_team_coded = case_when(
    number_of_authors == 1 & n_men_coded == 1 ~ "solo man", 
    number_of_authors == 1 & n_women_coded == 1 ~ "solo woman", 
    number_of_authors > 1 & n_men_coded == number_of_authors ~ "team of men", 
    number_of_authors > 1 & n_women_coded == number_of_authors ~ "team of women",
    number_of_authors > 1 & n_women_coded >=1 & n_men_coded >= 1 ~ "mixed team",
    n_women_coded == 0 & n_men_coded == 0 & number_of_authors == 1 ~ "solo uncoded", 
    number_of_authors > (n_women_coded + n_men_coded) & number_of_authors > 1 ~ "uncoded team"), 
    author_team_3 = case_when(
      author_team_coded == "solo man" | author_team_coded == "team of men" ~ "men", 
      author_team_coded == "solo woman" | author_team_coded == "team of women" ~ "women", 
      author_team_coded == "mixed team" ~ "mixed",  
      author_team_coded == "solo uncoded" | author_team_coded == "uncoded team" ~ "uncoded"))


sub.all = sub.all %>%
  mutate(author_team_id = case_when(
    number_of_authors == 1 & a_n_men_id == 1 ~ "solo man", 
    number_of_authors == 1 & a_n_women_id == 1 ~ "solo woman", 
    number_of_authors > 1 & a_n_men_id == number_of_authors ~ "team of men", 
    number_of_authors > 1 & a_n_women_id == number_of_authors ~ "team of women",
    number_of_authors > 1 & a_n_women_id >=1 & a_n_men_id >= 1 ~ "mixed team",
    a_n_women_id == 0 & a_n_men_id == 0 & number_of_authors == 1 ~ "solo uncoded", 
    number_of_authors > (a_n_women_id + a_n_men_id) & number_of_authors > 1 ~ "uncoded team"), 
    author_team3_id = case_when(
      author_team_id == "solo man" | author_team_id == "team of men" ~ "men", 
      author_team_id == "solo woman" | author_team_id == "team of women" ~ "women", 
      author_team_id == "mixed team" ~ "mixed",  
      author_team_id == "solo uncoded" | author_team_id == "uncoded team" ~ "uncoded"))

sub.all = sub.all %>%
  mutate(author_team_exp = case_when(
    number_of_authors == 1 & a_n_predoc > 0 ~ "Solo pre-doctoral", 
    number_of_authors == 1 & a_n_doc >0 ~ "Solo post-doctoral", 
    number_of_authors > 1 & a_n_predoc == number_of_authors ~ "Team pre-doctoral",
    number_of_authors > 1 & a_n_doc == number_of_authors ~ "Team post-doctoral", 
    number_of_authors > 1 & a_n_predoc >= 1 & a_n_doc >= 1 ~ "Team mixed experience"),
    author_team_org = case_when(
      a_n_college > 0 & number_of_authors == 1  ~ "Solo college", 
      number_of_authors == 1 & a_n_uni_nophd > 0 ~ "Solo university", 
      number_of_authors == 1 & a_n_uni_phd > 0 ~ "Solo PhD university",
      number_of_authors == 1 & a_n_other_inst > 0 ~ "Solo other org", 
      number_of_authors > 1 & a_n_uni_phd == number_of_authors ~ "Team PhD university", 
      number_of_authors > 1 & a_n_uni_nophd == number_of_authors ~ "Team university", 
      number_of_authors > 1 & a_n_college == number_of_authors ~ "Team college", 
      number_of_authors > 1 & a_n_other_inst == number_of_authors ~ "Team other",
      number_of_authors > 1 & a_n_other_inst == number_of_authors ~ "Team other",
      number_of_authors > 1 & (sum(a_n_uni_nophd + a_n_uni_phd + a_n_college + a_n_other_inst) > 1) ~ "Team mixed"), 
    author_team_tenure = case_when(
      number_of_authors == 1 & a_n_tenured > 0 ~ "Solo tenured", 
      number_of_authors == 1 & a_n_untenured >0 ~ "Solo untenured", 
      number_of_authors == 1 & a_n_student > 0 ~ "Solo student", 
      number_of_authors == 1 & a_n_other_pos >0 ~ "Solo other position", 
      number_of_authors > 1 & a_n_tenured == number_of_authors ~ "Team tenured",
      number_of_authors > 1 & a_n_untenured == number_of_authors ~ "Team untenured", 
      number_of_authors > 1 & a_n_student == number_of_authors ~ "Team students",
      number_of_authors > 1 & a_n_other_pos == number_of_authors ~ "Team other positions", 
      number_of_authors > 1 & ((a_n_tenured + a_n_untenured + a_n_student + a_n_other_pos) > 1) ~ "Team mixed positions"),
    author_team_eth = case_when(
      number_of_authors == 1 & a_n_white == number_of_authors ~ "Solo, white author", 
      number_of_authors == 1 & (a_n_latinx >0 | a_n_asian > 0 | a_n_other > 0 | a_n_indig > 0 | a_n_black > 0 | a_n_mideast > 0) ~ "Solo, BIPOC author", 
      number_of_authors == 1 & a_n_eth_dna == 1 ~ "Solo, declined", 
      number_of_authors > 1 & a_n_white == number_of_authors ~ "Team, all white authors", 
      number_of_authors > 1 & (a_n_latinx + a_n_asian + a_n_other + a_n_indig + a_n_black + a_n_mideast) == number_of_authors ~ "Team, all BIPOC authors", 
      number_of_authors > 1 & ((a_n_latinx + a_n_asian + a_n_other + a_n_indig + a_n_black + a_n_mideast) > 0) ~ "Team, 1+ BIPOC authors",
      number_of_authors > 1 & a_n_eth_dna > 0 ~ "Team, 1+ declined"))


######## code region of corresponding author 
sub.all = sub.all %>%
  mutate(corr_author_region = as_factor(countryname(country, destination = "region23")),
         corr_author_region5 = fct_lump_n(corr_author_region, n = 6), # combine all other regions into Other
         corr_author_region3 = fct_recode(corr_author_region5, "N. America" = "Northern America", Other = "Eastern Asia", Europe = "Northern Europe", Europe = "Western Europe", Europe = "Southern Europe", Other = "Western Asia"))


## Merge Classification
class = read_delim(here::here("data", "DOCUMENT_CLASS_UNICODE.tab"), 
                  delim = "\t", quote = "", locale = default_locale(), 
                  guess_max = 60000) %>%
  select_if(all_na) %>% # remove columns that are all NA
  clean_names() %>% #default is snakecase
  rename(man_num = manuscript_number) %>% # rename key variables
  select(man_num, classification_description) %>%
  unique()

# From my looking at the file it seems it only take the classification of the latest submission!
# not a problem per se, but just something to note. 

# Pivot wider doesn't work because don't have name_from, stems in part from issue I note above
class<-class %>%
  group_by(man_num) %>%
  mutate(terms_by_revision = paste0(classification_description, collapse = "; ")) %>%
  slice_head() %>%
  select(man_num, terms_by_revision) %>%
  rename(classification_description = terms_by_revision) %>%
  na.omit() # there was a wild character there at the end I needed to get rid of

sub.all <- sub.all %>%
  left_join(class, by = "man_num")


###############################
# remove dummy file created by ME
###############################
sub.all = sub.all %>%
  filter(man_num != "APSR-D-20-00881")
# if we ever go back to 2007 - make sure the clean up D-07-00312, b/c it was accidentally rescinded 

colnames(sub.all)


# export cleaned file
write_csv(sub.all, here("data","sub.all.csv"))
saveRDS(sub.all, here("data", "sub.all.rds"))

# export file for codebook 
write_csv(head(sub.all,50), here("data","temp.sub.all.csv"))
write_csv(tail(sub.all,50), here("data","temp2.sub.all.csv"))

